## Replication code for Estimating Candidates’ Political Orientation in a Polarized Congress
## We use R for 3.2.4 GUI 1.67 Mavericks build (7152), running on Mac OS 10.12.1 


# Recommended: run with source("Code/rollcalls_PCP_APRE.R")

# Table 4:
# run with chamber="house"; time="contemporaneous"
# Table 5:
# run with chamber="house"; time="prospective"
# Table 6:
# run with chamber="senate"; time="contemporaneous"
# Table 7:
# run with chamber="senate"; time="prospective"

library(foreign) ## We use version: foreign_0.8-66
library(xtable)  ## We use version: xtable_1.8-2



# load the estimates
load(file=paste("candidatepositions",chamber,"analysis.RData",sep="_"))
load(file="OptimalClassification_results.RData")
if (chamber=="house"){
	analysis_all<-merge(analysis_all, house_oc_results, by.x=c("ICPSR2","cong_contemporaneous"), by.y=c("ICPSR2","cong"), all.x=T)
}
if (chamber=="senate"){
	analysis_all<-merge(analysis_all, senate_oc_results, by.x=c("ICPSR2","cong_contemporaneous"), by.y=c("ICPSR2","cong"), all.x=T)
}
cor(analysis_all$dwnom1_contemporaneous, analysis_all$OC, use="complete.obs")

estimates <- analysis_all
rm(analysis_all)
estimates$cong <- estimates[[paste("cong",time,sep="_")]]
estimates$js_ideology <- estimates[[paste("js_ideology",time,sep="_")]]


# create a list containing all the roll call matrices
if(chamber=="house"){
#datanames=c("hou107kh.dta","hou108kh.dta","hou109kh.dta","hou110kh.dta","hou111kh.dta", "hou112kh.dta", "hou113kh.dta")
#datasets = list(read.dta("Roll Calls/hou107kh.dta"),read.dta("Roll Calls/hou108kh.dta"),read.dta("Roll Calls/hou109kh.dta"),read.dta("Roll Calls/hou110kh.dta"),read.dta("Roll Calls/hou111kh.dta"),read.dta("Roll Calls/hou112kh.dta"),read.dta("Roll Calls/hou113kh.dta"))
#names(datasets) <- datanames
#save(datanames,datasets,file="house_rollcalls.RData")
load(file="house_rollcalls.RData")
}
if(chamber=="senate"){
#datanames=c("sen107kh.dta","sen108kh.dta","sen109kh.dta","sen110kh.dta","sen111kh.dta", "sen112kh.dta", "sen113kh.dta")
#datasets = list(read.dta("Roll Calls/sen107kh.dta"),read.dta("Roll Calls/sen108kh.dta"),read.dta("Roll Calls/sen109kh.dta"),read.dta("Roll Calls/sen110kh.dta"),read.dta("Roll Calls/sen111kh.dta"),read.dta("Roll Calls/sen112kh.dta"),read.dta("Roll Calls/sen113kh.dta"))
#names(datasets) <- datanames
#save(datanames,datasets,file="senate_rollcalls.RData")
load(file="senate_rollcalls.RData")
}

# this list will contain estimates matched to each roll call matrix
matched_estimates <- list()

### Functions for summarizing predictions


correct_pred<- function(reg,votes,iv){
# computes the number of correct predictions for a given
# model, vote, and independent variable
    temp<-predict(reg,type="response")
    correct<-rep(NA, length(temp))
    correct[temp >= .5 & votes==1 & !is.na(iv)]<-1
    correct[temp<.5 & votes==0 & !is.na(iv)]<-1
    correct[temp >= .5 & votes==0 & !is.na(iv)]<-0
    correct[temp<.5 & votes==1 & !is.na(iv)]<-0
    sum(correct)
}

incorrect_pred<- function(reg,votes,iv){
# computes the number of *incorrect* predictions for a given
# model, vote, and independent variable
    temp<-predict(reg,type="response")
    incorrect<-rep(NA, length(temp))
    incorrect[temp >= .5 & votes==1 & !is.na(iv)]<-0
    incorrect[temp<.5 & votes==0 & !is.na(iv)]<-0
    incorrect[temp >= .5 & votes==0 & !is.na(iv)]<-1
    incorrect[temp<.5 & votes==1 & !is.na(iv)]<-1
    sum(incorrect)
}

correct_naive <- function(votes){
# computes the number of correct predictions we would get from a naive model
# that assumes that everyone votes with the majority
    max(0,max(table(votes),na.rm=T),na.rm=T)
}

total_votes <- function(votes){
# returns the number of votes cast
    sum(sum(table(votes)),0,na.rm=T)
}

total_leg <- function(votes){
# returns the total number of legislators voting
    if(is.null(votes)){return(0)}
    # this next line returns 1 legislator if "votes"
    # is a vector, which assumes there is not only
    # 1 vote.
    if(is.null(dim(votes))){return(1)}
    sum(! is.na(rowSums(votes,na.rm=T)))
}

not_unanimous <- function(x){
# returns TRUE if a vote is not unanimous
    ifelse(is.na(table(x)[2]),F,T)
}

robust_apply <- function(X,dim,fun){
# a wrapper for apply that handles vectors *and* matrices
    if(is.null(dim(X))){
        return(fun(X))
    } else {
        return(apply(X,dim,fun))
    }
}

# loop over each dataset,
# create a version of the estimates matched to each roll call matrix
# and recode the roll calls
for(dataname in datanames){
    print(dataname)
    
    ## Select a congress of the data
    rollcalls <- datasets[[dataname]]

    # find the legislator id in the roll call matrix
    idno<-grep("idno", colnames(rollcalls))
    if(length(idno)!=0){colnames(rollcalls)[colnames(rollcalls)=="idno"]<-"id"}
    # and use it to sort a matched version of the estimate matrix
    temp <- estimates[which(estimates$cong==rollcalls$cong[1]),]
    matched_estimates[[dataname]] <- temp[match(rollcalls$id,temp$ICPSR2),]
    
    grab<-grep(c("V"),colnames(rollcalls))
    if(length(grab)==0){grab<-grep(c("var"),colnames(rollcalls))}
    rollcalls<-rollcalls[,grab]
    rollcalls[rollcalls==0]<-NA
    rollcalls[rollcalls==7]<-NA
    rollcalls[rollcalls==8]<-NA
    rollcalls[rollcalls==9]<-NA
    rollcalls[rollcalls==2]<-1
    rollcalls[rollcalls==3]<-1
    rollcalls[rollcalls==4]<-0
    rollcalls[rollcalls==5]<-0
    rollcalls[rollcalls==6]<-0

    #remove unanimous votes
    rollcalls <- rollcalls[,apply(rollcalls,2,not_unanimous)]

    # save result back to list of datasets
    datasets[[dataname]] <- as.matrix(rollcalls)
}

correct <- list()
incorrect <- list() #error checking
correct_party <- list()
incorrect_party <- list() #error checking
naive <- list()
total <- list()
legnum <- list()

# loop over the roll call datasets
for(dataname in datanames){
    print(dataname)
    # subset to just the rollcalls for one congress
    rollcalls <- datasets[[dataname]]
    correct[[dataname]] <- list()
    incorrect[[dataname]] <- list()
    correct_party[[dataname]] <- list()
    incorrect_party[[dataname]] <- list()
    naive[[dataname]] <- list()
    total[[dataname]] <- list()
    # loop over the estimates
    for (j in 1:10){
        print(j)
        # select estimate for this analysis
        if (j==1){iv<-as.factor(matched_estimates[[dataname]]$party_nominate)} #Party)}
        if (j==2){iv<-as.numeric(matched_estimates[[dataname]]$cfscores.dyn)} #.dyn
        if (j==3){iv<-as.numeric(matched_estimates[[dataname]]$js_ideology)}
        if (j==4){iv<-as.numeric(matched_estimates[[dataname]]$twitter_idealpoint)}
        if (j==5){iv<-as.numeric(matched_estimates[[dataname]]$npat_score)}
        if (j==6){iv<-as.numeric(matched_estimates[[dataname]]$am_estimate)}
        if (j==7){iv<-as.numeric(matched_estimates[[dataname]]$shor_mccarty_score)}
        if (j==8){iv<-as.numeric(matched_estimates[[dataname]]$dwnom1_contemporaneous)}
        if (j==9){iv<-as.numeric(matched_estimates[[dataname]]$am_estimate_tw)}
         if (j==10){iv<-as.numeric(matched_estimates[[dataname]]$OC)}
       # vector of legislator party
        party <- as.factor(matched_estimates[[dataname]]$party_nominate)#Party)
        # calculate the naive votes correctly predicted,
        # where the estimates are available
        naive[[dataname]][[j]] <- robust_apply(rollcalls[! is.na(iv),],2,correct_naive)
        # calculated the total non-NA votes,
        # where the estimates are available
        total[[dataname]][[j]] <- robust_apply(rollcalls[! is.na(iv),],2,total_votes)
        # calculate the total non-NA legislators
        legnum[[dataname]][[j]] <- total_leg(rollcalls[! is.na(iv),])
        # the number of roll calls
        vote_num <- ifelse(is.null(rollcalls),0,
                           ifelse(is.null(dim(rollcalls)),1,
                                  dim(rollcalls)[2]))
        # vector to store counts of correctly predicted votes
        correct[[dataname]][[j]] <- rep(NA,vote_num)
        incorrect[[dataname]][[j]] <- rep(NA,vote_num) #error checking
        # vector to store counts of correctly predicted votes by party
        correct_party[[dataname]][[j]] <- rep(NA,vote_num)
        incorrect_party[[dataname]][[j]] <- rep(NA,vote_num) #error checking
        # loop over roll calls
        if(length(which(! is.na(iv))) > 1 & ! is.null(dim(rollcalls))){
        for(i in 1:vote_num){
            # figure out when both the vote and estimate are not NA
            notna <- ! is.na(rollcalls[,i])  & ! is.na(iv)

            # logit of votes on estimates, excluding NAs for either
            if(! all(! notna)){
            if(length(unique(iv[notna]))>1){ reg<-glm(rollcalls[notna,i]~iv[notna],family=binomial(link = "logit"))
            } else {
            reg<-glm(rollcalls[notna,i]~1,family=binomial(link = "logit"))}
            
            # how many of the predicted values are correct, total?
            correct[[dataname]][[j]][i] <- correct_pred(reg,rollcalls[notna,i],iv[notna])
            incorrect[[dataname]][[j]][i] <- incorrect_pred(reg,rollcalls[notna,i],iv[notna]) #error checking
            
            # now repeat this for this set of votes, but using party
            # as the predictor.
            if(length(unique(party[notna]))>1){ reg<-glm(rollcalls[notna,i]~party[notna],family=binomial(link = "logit"))
            } else {
            reg<-glm(rollcalls[notna,i]~1,family=binomial(link = "logit"))}
            correct_party[[dataname]][[j]][i] <- correct_pred(reg,rollcalls[notna,i],party[notna])
            incorrect_party[[dataname]][[j]][i] <- incorrect_pred(reg,rollcalls[notna,i],party[notna]) #error checking
        }}
    }
    }
}

# this array wil contain the results for each ideology measure, measure of accuracy, and congressional session
arr <- array(NA,dim=c(10,6,length(datanames)),
             dimnames=list(
                 c("Party","CF-Score","Experts","Twitter","NPAT",
                   "Survey","State Leg.","DW-Nominate","TW Survey", "Optimal Classification"),
                 c("APRE","PCP","Party PCP","Votes","Legislators","Improvement over party"),
                 datanames))

for(dataname in datanames){
    print(dataname)
    for(j in 1:10){
        thistotal <- sum(total[[dataname]][[j]])
        thisnaive <- sum(naive[[dataname]][[j]])
        thispred <- sum(correct[[dataname]][[j]])
        thispred_party <- sum(correct_party[[dataname]][[j]])
        naive_error <- thistotal - thisnaive
        model_error <- thistotal - thispred
        ind <- cbind(j,1,which(datanames==dataname))
        arr[ind] <- (naive_error - model_error)/naive_error
        ind <- cbind(j,2,which(datanames==dataname))
        arr[ind] <- thispred/thistotal
        ind <- cbind(j,3,which(datanames==dataname))
        arr[ind] <- thispred_party/thistotal
        ind <- cbind(j,4,which(datanames==dataname))
        arr[ind] <- as.integer(thistotal) 
        ind <- cbind(j,5,which(datanames==dataname))
        arr[ind] <- legnum[[dataname]][[j]]
        ind <- cbind(j,6,which(datanames==dataname))
        arr[ind] <- 1-(1-as.numeric(arr[j,2,which(datanames==dataname)]))/(1-as.numeric(arr[j,3,which(datanames==dataname)]))
    }
}

arr


# this code turns the result into a table for each congress
tmp <- arr[,,1]
# Format numbers
tmp[,1:3] <- paste(round(tmp[,1:3]*100,1),"%",sep="")
tmp[,6] <- paste(round(arr[,6,1]*100,1),"%",sep="")
# Select only non-NA rows and desired columns
tmp <- tmp[rowSums(is.na(arr[,,1]))==0,c(1,2,3,5,6)]
# Save session-by-session results:
#cat(print(xtable(tmp)),file=paste("apre_results_",chamber,
#                           "_",time,".txt",sep=""))

# Iterate over the remaining congresses, and make this table
# again for each one.
for(i in 2:length(datanames)){
    tmp <- arr[,,i]
    # Format numbers
    tmp[,1:3] <- paste(round(tmp[,1:3]*100,1),"%",sep="")
    tmp[,6] <- paste(round(arr[,6,i]*100,1),"%",sep="")
    # Select only non-NA rows and desired columns
    tmp <- tmp[rowSums(is.na(arr[,,i]))==0,c(1,2,3,5,6)]
    cat(print(xtable(tmp)),file=paste("apre_results_",chamber,
                           "_",time,".txt",sep=""),append=T)
}

### This code aggregates the results by congress, weighting by the number of votes cast
sumatrix <- matrix(NA,10,7)
rownames(sumatrix) <- c("Party","CF-Score","Experts","Twitter","NPAT","Survey","State Leg.","DW-Nominate","TW Survey", "Optimal Classification")
colnames(sumatrix) <- c("APRE","PCP","PCP Party","Votes","Legislator-Sessions","Improvement over party*","Improvement over party")

for(j in 1:10){
    temp <- arr[j,,]
    sumatrix[j,1] <- weighted.mean(temp[1,],temp[4,],na.rm=T)
    sumatrix[j,2] <- weighted.mean(temp[2,],temp[4,],na.rm=T)
    sumatrix[j,3] <- weighted.mean(temp[3,],temp[4,],na.rm=T)
    sumatrix[j,4] <- sum(temp[4,],na.rm=T)
    sumatrix[j,5] <- sum(temp[5,],na.rm=T)
    sumatrix[j,6] <- weighted.mean(temp[6,],temp[4,],na.rm=T)
    sumatrix[j,7] <- 1-(1-sumatrix[j,2])/(1-sumatrix[j,3])
}
sumatrix

# Makes the results into a Table, as in Table 4-7
tmp <- sumatrix
# Format numbers
tmp[,1:3] <- paste(round(tmp[,1:3]*100,1),"%",sep="")
tmp[,6] <- paste(round(sumatrix[,6]*100,1),"%",sep="")
tmp[,7] <- paste(round(sumatrix[,7]*100,1),"%",sep="")
# Select only NA rows and desired columns
tmp <- tmp[rowSums(is.na(sumatrix))==0,c(2,3,5,7)]
if(chamber == "house" & time == "contemporaneous"){filename <- "Table4.txt"}
if(chamber == "house" & time == "prospective"){filename <- "Table5.txt"}
if(chamber == "senate" & time == "contemporaneous"){filename <- "Table6.txt"}
if(chamber == "senate" & time == "prospective"){filename <- "Table7.txt"}
cat(print(xtable(tmp)),file=filename)


###
### error checking
###

while(0){
for (dataname in datanames){
   for (j in 1:10){
   rollcalls <- datasets[[dataname]]
   vote_num <- ifelse(is.null(rollcalls),0,
                      ifelse(is.null(dim(rollcalls)),1,
                      dim(rollcalls)[2]))
      for(i in 1:vote_num){
         test <- correct[[dataname]][[j]][i] + incorrect[[dataname]][[j]][i] == total[[dataname]][[j]][i]
         if(! is.na(test)){ if(! test){
         print(paste("Sum error",dataname,j,i,sep=","))}}
         if(is.na(test)){if(! all(is.na(c(correct[[dataname]][[j]][i], incorrect[[dataname]][[j]][i]))) | ! total[[dataname]][[j]][i] %in% c(0,NA)){
         print(paste("NA error",dataname,j,i,sep=","))
         stop()}}          
}
}
}
}
